import os
import shutil
import subprocess
import warnings
from pathlib import Path
import numpy as np
import pandas as pd
import plotly.express as px
import tensorflow as tf
import tensorflow_datasets as tfds
from tensorflow import keras
from keras import layers
from colorama import Fore, Style
from IPython.core.display import HTML
2024-04-08 10:14:27.321908: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. 2024-04-08 10:14:31.189267: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags. /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html from .autonotebook import tqdm as notebook_tqdm
# Load the English -> Turkish sentence-pair corpus and shuffle it once,
# reproducibly, so the later head/tail train/validation split is random.
easy_dataset_path = Path("/home/azureuser/cloudfiles/code/Users/NAZAR.KHOKHLA.2023/sentences.csv")
easy_dataset = pd.read_csv(easy_dataset_path, encoding="utf-8", engine="pyarrow")
# sample(n=len(df)) with a fixed seed is a deterministic full-row shuffle.
easy_dataset = easy_dataset.sample(len(easy_dataset), random_state=42)
easy_dataset.head()
| English | The Other Language | |
|---|---|---|
| 93238 | A cold snap is expected this week. | Bu hafta bir soğuk hava dalgası bekleniyor. |
| 118815 | Tom was married to Mary for three years. | Tom üç yıldır Mary ile evliydi. |
| 4568 | We all know you. | Biz hepimiz seni tanıyoruz. |
| 93723 | Have you ever eaten monkey brains? | Hiç maymun beyni yedin mi? |
| 49845 | I bought half a dozen eggs. | Yarım düzine yumurta aldım. |
# Quick schema check: two non-null object (string) columns, ~145k rows.
easy_dataset.info()
<class 'pandas.core.frame.DataFrame'> Index: 145105 entries, 93238 to 121958 Data columns (total 2 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 English 145105 non-null object 1 The Other Language 145105 non-null object dtypes: object(2) memory usage: 3.3+ MB
easy_dataset["English Words in Sentence"] = (
easy_dataset["English"].str.split().apply(len)
)
easy_dataset["Turkish Words in Sentence"] = (
easy_dataset["The Other Language"].str.split().apply(len)
)
# Grouped histogram (with marginal box plots) comparing the word-count
# distributions of the English and Turkish sentence columns.
fig = px.histogram(
    easy_dataset,
    x=["English Words in Sentence", "Turkish Words in Sentence"],
    color_discrete_sequence=["#3f384a", "#e04c5f"],
    labels={"variable": "Variable", "value": "Words in Sentence"},
    marginal="box",
    barmode="group",
    height=540,
    width=840,
    title="Easy Dataset - Words in Sentence",
)
# Shared notebook theme: dark-blue text on an off-white background,
# horizontal legend anchored above the plot area.
fig.update_layout(
    font_color="#141B4D",
    title_font_size=18,
    plot_bgcolor="#F6F5F5",
    paper_bgcolor="#F6F5F5",
    bargap=0.2,
    bargroupgap=0.1,
    legend=dict(orientation="h", yanchor="bottom", xanchor="right", y=1.02, x=1),
    yaxis_title="Count",
)
fig.show()
# Hold out the last 10% of the (already shuffled) corpus for validation.
sentences_en = easy_dataset["English"].to_numpy()
sentences_tr = easy_dataset["The Other Language"].to_numpy()
valid_fraction = 0.1
valid_len = int(valid_fraction * len(easy_dataset))
split_point = len(sentences_en) - valid_len
sentences_en_train = sentences_en[:split_point]
sentences_tr_train = sentences_tr[:split_point]
sentences_en_valid = sentences_en[split_point:]
sentences_tr_valid = sentences_tr[split_point:]
def prepare_input_and_target(sentences_en, sentences_tr):
    """Build one training example: `((encoder_input, decoder_input), target)`.

    The decoder input is the Turkish sentence prefixed with the start token;
    the target is the same sentence suffixed with the end token (teacher
    forcing). Works on byte strings and on `tf.string` tensors alike, since
    both support `+` concatenation.
    """
    decoder_input = b"startofseq " + sentences_tr
    target = sentences_tr + b" endofseq"
    return (sentences_en, decoder_input), target
def from_sentences_dataset(
    sentences_en,
    sentences_tr,
    batch_size=32,
    cache=True,
    shuffle=False,
    shuffle_buffer_size=10_000,
    seed=None,
):
    """Build a batched `tf.data` pipeline of encoder-decoder string examples.

    Pairs the two sentence arrays, maps them through
    `prepare_input_and_target` (adding the start/end tokens), optionally
    caches and shuffles, and finally batches.
    """
    ds = tf.data.Dataset.from_tensor_slices((sentences_en, sentences_tr)).map(
        prepare_input_and_target, num_parallel_calls=tf.data.AUTOTUNE
    )
    if cache:
        # Cache before shuffling so every epoch gets a fresh shuffle order.
        ds = ds.cache()
    if shuffle:
        ds = ds.shuffle(shuffle_buffer_size, seed=seed)
    return ds.batch(batch_size)
# Measure raw input-pipeline throughput (examples/sec) with TFDS's benchmark
# helper; prefetch overlaps data preparation with consumption.
benchmark_ds = from_sentences_dataset(sentences_en_train, sentences_tr_train)
benchmark_ds = benchmark_ds.prefetch(tf.data.AUTOTUNE)
bench_results = tfds.benchmark(benchmark_ds, batch_size=32)
************ Summary ************ Examples/sec (First included) 40614.10 ex/sec (total: 130656 ex, 3.22 sec) Examples/sec (First only) 270.53 ex/sec (total: 32 ex, 0.12 sec) Examples/sec (First excluded) 42154.14 ex/sec (total: 130624 ex, 3.10 sec)
100%|██████████| 4082/4082 [00:03<00:00, 1312.97it/s]
# Peek at one small batch to sanity-check the ((encoder, decoder), target)
# structure and the startofseq/endofseq tokens.
example_ds = from_sentences_dataset(
    sentences_en_train, sentences_tr_train, batch_size=4
)
list(example_ds.take(1))[0]
2024-04-08 10:14:49.560787: W tensorflow/core/kernels/data/cache_dataset_ops.cc:854] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.
((<tf.Tensor: shape=(4,), dtype=string, numpy=
array([b'A cold snap is expected this week.',
b'Tom was married to Mary for three years.', b'We all know you.',
b'Have you ever eaten monkey brains?'], dtype=object)>,
<tf.Tensor: shape=(4,), dtype=string, numpy=
array([b'startofseq Bu hafta bir so\xc4\x9fuk hava dalgas\xc4\xb1 bekleniyor.',
b'startofseq Tom \xc3\xbc\xc3\xa7 y\xc4\xb1ld\xc4\xb1r Mary ile evliydi.',
b'startofseq Biz hepimiz seni tan\xc4\xb1yoruz.',
b'startofseq Hi\xc3\xa7 maymun beyni yedin mi?'], dtype=object)>),
<tf.Tensor: shape=(4,), dtype=string, numpy=
array([b'Bu hafta bir so\xc4\x9fuk hava dalgas\xc4\xb1 bekleniyor. endofseq',
b'Tom \xc3\xbc\xc3\xa7 y\xc4\xb1ld\xc4\xb1r Mary ile evliydi. endofseq',
b'Biz hepimiz seni tan\xc4\xb1yoruz. endofseq',
b'Hi\xc3\xa7 maymun beyni yedin mi? endofseq'], dtype=object)>)
# Batched cardinality = ceil(n_train / batch_size) for this pipeline.
example_ds.cardinality()  # Number of batches per epoch.
<tf.Tensor: shape=(), dtype=int64, numpy=32649>
# Colorama styles used by the verbose training callback: bright white for
# labels, bright red for metric values.
CLR = Style.BRIGHT + Fore.WHITE
RED = Style.BRIGHT + Fore.RED
class ColoramaVerbose(keras.callbacks.Callback):
    """Print one compact colored line per epoch (label/value color-coded)."""

    def on_epoch_end(self, epoch, logs=None):
        # Each segment ends with CLR so the " - " separators render white.
        segments = [
            f"{CLR}Epoch: {RED}{epoch + 1:02d}{CLR}",
            f"{CLR}loss: {RED}{logs['loss']:.5f}{CLR}",
            f"{CLR}accuracy: {RED}{logs['accuracy']:.5f}{CLR}",
            f"{CLR}val_loss: {RED}{logs['val_loss']:.5f}{CLR}",
            f"{CLR}val_accuracy: {RED}{logs['val_accuracy']:.5f}",
        ]
        print(" - ".join(segments))
def adapt_compile_and_fit(
    model,
    train_dataset,
    valid_dataset,
    n_epochs=25,
    n_patience=5,
    init_lr=0.001,
    lr_decay_rate=0.1,
    colorama_verbose=False,
):
    """Adapt the model's vectorization layers, then compile and train it.

    Adapts `model.vectorization_en` / `model.vectorization_tr` on the
    training sentences, vectorizes the Turkish targets, prefetches both
    datasets, and trains with an exponentially decaying learning rate and
    early stopping (best weights restored). Returns the `History` object
    from `model.fit`.

    Args:
        model: Model exposing `vectorization_en` and `vectorization_tr`.
        train_dataset: Batched dataset of `((en, tr_input), tr_target)` strings.
        valid_dataset: Validation dataset with the same structure.
        n_epochs: Maximum number of training epochs.
        n_patience: Early-stopping patience (epochs) on `val_accuracy`.
        init_lr: Initial learning rate for the RMSprop optimizer.
        lr_decay_rate: Overall decay factor reached at the final decay step.
        colorama_verbose: If True, silence the Keras progress bar and print
            the compact colored per-epoch line from `ColoramaVerbose` instead.
    """
    model.vectorization_en.adapt(
        train_dataset.map(
            lambda sentences, target: sentences[0],  # English sentences.
            num_parallel_calls=tf.data.AUTOTUNE,
        )
    )
    # The target vocabulary must also contain the end token, which appears
    # only in the targets, not in the decoder inputs.
    model.vectorization_tr.adapt(
        train_dataset.map(
            lambda sentences, target: sentences[1] + b" endofseq",  # Turkish sentences.
            num_parallel_calls=tf.data.AUTOTUNE,
        )
    )
    # Targets become token-id tensors here; inputs stay raw strings because
    # the model vectorizes them itself in `call`.
    train_dataset_prepared = train_dataset.map(
        lambda sentences, target: (sentences, model.vectorization_tr(target)),
        num_parallel_calls=tf.data.AUTOTUNE,
    ).prefetch(tf.data.AUTOTUNE)
    valid_dataset_prepared = valid_dataset.map(
        lambda sentences, target: (sentences, model.vectorization_tr(target)),
        num_parallel_calls=tf.data.AUTOTUNE,
    ).prefetch(tf.data.AUTOTUNE)
    early_stopping_cb = keras.callbacks.EarlyStopping(
        monitor="val_accuracy", patience=n_patience, restore_best_weights=True
    )
    # The line below doesn't work with multi-file interleaving.
    # n_decay_steps = n_epochs * train_dataset_prepared.cardinality().numpy()
    # Less elegant solution: materialize the dataset once to count batches.
    n_decay_steps = n_epochs * len(list(train_dataset_prepared))
    scheduled_lr = keras.optimizers.schedules.ExponentialDecay(
        initial_learning_rate=init_lr,
        decay_steps=n_decay_steps,
        decay_rate=lr_decay_rate,
    )
    model_callbacks = [early_stopping_cb]
    verbose_level = 1
    if colorama_verbose:
        model_callbacks.append(ColoramaVerbose())
        verbose_level = 0  # Silence the default bar; the callback prints instead.
    model.compile(
        loss="sparse_categorical_crossentropy",
        optimizer=keras.optimizers.RMSprop(learning_rate=scheduled_lr),
        metrics=["accuracy"],
    )
    return model.fit(
        train_dataset_prepared,
        epochs=n_epochs,
        validation_data=valid_dataset_prepared,
        callbacks=model_callbacks,
        verbose=verbose_level,
    )
def translate(model, sentence_en):
    """Greedy-decode a Turkish translation of `sentence_en`, one word per step.

    Feeds the growing translation back into the decoder each iteration and
    takes the argmax token, stopping at "endofseq" or after
    `model.max_sentence_len` tokens.

    Args:
        model: Trained model exposing `max_sentence_len`, `vectorization_tr`,
            and a `((encoder, decoder))`-input `predict`.
        sentence_en: English source sentence as a plain string.

    Returns:
        The decoded Turkish sentence as a stripped string (may be empty).
    """
    # Hoist loop invariants: get_vocabulary() rebuilds the id->word list from
    # the layer on every call, and the encoder input never changes during
    # decoding — the original recomputed both on each step.
    vocabulary = model.vectorization_tr.get_vocabulary()
    X_encoder = np.array([sentence_en])
    translation = ""
    for word_idx in range(model.max_sentence_len):
        X_decoder = np.array(["startofseq " + translation])
        # Probabilities of the token at position `word_idx` (the last one).
        y_proba = model.predict((X_encoder, X_decoder), verbose=0)[0, word_idx]
        predicted_word_id = np.argmax(y_proba)
        predicted_word = vocabulary[predicted_word_id]
        if predicted_word == "endofseq":
            break
        translation += " " + predicted_word
    return translation.strip()
class BidirectionalEncoderDecoderWithAttention(keras.Model):
    """Encoder-decoder RNN for string-to-string translation with attention.

    Takes raw string tensors as `(encoder_inputs, decoder_inputs)`: each side
    is vectorized and embedded (with padding masks), a bidirectional LSTM
    encodes the source, a unidirectional LSTM decodes (initialized from the
    encoder's final states), and the decoder outputs attend over the encoder
    outputs before the softmax projection onto the target vocabulary.
    """

    def __init__(
        self,
        vocabulary_size=5000,
        max_sentence_len=50,
        embedding_size=256,
        n_units_lstm=512,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.max_sentence_len = max_sentence_len
        # Separate vectorizers (and vocabularies) per language; both are
        # adapted on the training data before fitting.
        self.vectorization_en = layers.TextVectorization(
            vocabulary_size, output_sequence_length=max_sentence_len
        )
        self.vectorization_tr = layers.TextVectorization(
            vocabulary_size, output_sequence_length=max_sentence_len
        )
        # mask_zero=True propagates the padding mask downstream.
        self.encoder_embedding = layers.Embedding(
            vocabulary_size, embedding_size, mask_zero=True
        )
        self.decoder_embedding = layers.Embedding(
            vocabulary_size, embedding_size, mask_zero=True
        )
        # Half-size units per direction so the concatenated forward/backward
        # states match the decoder's n_units_lstm.
        self.encoder = layers.Bidirectional(
            layers.LSTM(n_units_lstm // 2, return_sequences=True, return_state=True)
        )
        self.decoder = layers.LSTM(n_units_lstm, return_sequences=True)
        self.attention = layers.Attention()
        self.output_layer = layers.Dense(vocabulary_size, activation="softmax")

    def call(self, inputs):
        encoder_inputs, decoder_inputs = inputs
        # Raw strings -> padded token ids -> masked embeddings.
        encoder_input_ids = self.vectorization_en(encoder_inputs)
        decoder_input_ids = self.vectorization_tr(decoder_inputs)
        encoder_embeddings = self.encoder_embedding(encoder_input_ids)
        decoder_embeddings = self.decoder_embedding(decoder_input_ids)
        # The final hidden state of the encoder, representing the entire
        # input sequence, is used to initialize the decoder.
        encoder_output, *encoder_state = self.encoder(encoder_embeddings)
        # Bidirectional yields [fwd_h, fwd_c, bwd_h, bwd_c]; interleaved
        # slicing pairs the forward/backward halves of each state.
        encoder_state = [
            tf.concat(encoder_state[0::2], axis=-1),  # Short-term state (0 & 2).
            tf.concat(encoder_state[1::2], axis=-1),  # Long-term state (1 & 3).
        ]
        decoder_output = self.decoder(decoder_embeddings, initial_state=encoder_state)
        # Decoder outputs act as queries over the encoder outputs.
        attention_output = self.attention([decoder_output, encoder_output])
        return self.output_layer(attention_output)
keras.backend.clear_session()  # Resets all state generated by Keras.
tf.random.set_seed(42)  # Ensure reproducibility on CPU.
# Shuffled training pipeline; validation data is left in order.
easy_train_ds = from_sentences_dataset(
    sentences_en_train, sentences_tr_train, shuffle=True, seed=42
)
easy_valid_ds = from_sentences_dataset(sentences_en_valid, sentences_tr_valid)
# Sequences are truncated/padded to 15 tokens instead of the default 50.
bidirect_encoder_decoder = BidirectionalEncoderDecoderWithAttention(max_sentence_len=15)
bidirect_history = adapt_compile_and_fit(
    bidirect_encoder_decoder,
    easy_train_ds,
    easy_valid_ds,
    init_lr=0.01,
    lr_decay_rate=0.01,
    colorama_verbose=True,
)
2024-04-08 10:15:07.435782: W tensorflow/core/grappler/costs/op_level_cost_estimator.cc:693] Error in PredictCost() for the op: op: "Softmax" attr { key: "T" value { type: DT_FLOAT } } inputs { dtype: DT_FLOAT shape { unknown_rank: true } } device { type: "CPU" vendor: "GenuineIntel" model: "101" frequency: 2593 num_cores: 4 environment { key: "cpu_instruction_set" value: "AVX SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2" } environment { key: "eigen" value: "3.4.90" } l1_cache_size: 32768 l2_cache_size: 1048576 l3_cache_size: 37486592 memory_size: 268435456 } outputs { dtype: DT_FLOAT shape { unknown_rank: true } }
2024-04-08 10:25:28.540718: W tensorflow/core/grappler/costs/op_level_cost_estimator.cc:693] Error in PredictCost() for the op: op: "Softmax" attr { key: "T" value { type: DT_FLOAT } } inputs { dtype: DT_FLOAT shape { unknown_rank: true } } device { type: "CPU" vendor: "GenuineIntel" model: "101" frequency: 2593 num_cores: 4 environment { key: "cpu_instruction_set" value: "AVX SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2" } environment { key: "eigen" value: "3.4.90" } l1_cache_size: 32768 l2_cache_size: 1048576 l3_cache_size: 37486592 memory_size: 268435456 } outputs { dtype: DT_FLOAT shape { unknown_rank: true } }
Epoch: 01 - loss: 3.32274 - accuracy: 0.43764 - val_loss: 2.61784 - val_accuracy: 0.50026 Epoch: 02 - loss: 2.35497 - accuracy: 0.52489 - val_loss: 2.29117 - val_accuracy: 0.53278 Epoch: 03 - loss: 2.00593 - accuracy: 0.56653 - val_loss: 2.12529 - val_accuracy: 0.55982 Epoch: 04 - loss: 1.76258 - accuracy: 0.60130 - val_loss: 2.01952 - val_accuracy: 0.57956 Epoch: 05 - loss: 1.56285 - accuracy: 0.63339 - val_loss: 1.96088 - val_accuracy: 0.59106 Epoch: 06 - loss: 1.38447 - accuracy: 0.66480 - val_loss: 1.91254 - val_accuracy: 0.60183 Epoch: 07 - loss: 1.22390 - accuracy: 0.69614 - val_loss: 1.88595 - val_accuracy: 0.61239 Epoch: 08 - loss: 1.07653 - accuracy: 0.72675 - val_loss: 1.88504 - val_accuracy: 0.61764 Epoch: 09 - loss: 0.94293 - accuracy: 0.75717 - val_loss: 1.89759 - val_accuracy: 0.61966 Epoch: 10 - loss: 0.82661 - accuracy: 0.78585 - val_loss: 1.90609 - val_accuracy: 0.62380 Epoch: 11 - loss: 0.72516 - accuracy: 0.81215 - val_loss: 1.93893 - val_accuracy: 0.62450 Epoch: 12 - loss: 0.63992 - accuracy: 0.83526 - val_loss: 1.96640 - val_accuracy: 0.62589 Epoch: 13 - loss: 0.56886 - accuracy: 0.85613 - val_loss: 1.99304 - val_accuracy: 0.62742 Epoch: 14 - loss: 0.51006 - accuracy: 0.87297 - val_loss: 2.02849 - val_accuracy: 0.62658 Epoch: 15 - loss: 0.46252 - accuracy: 0.88672 - val_loss: 2.05606 - val_accuracy: 0.62650
# Plot all recorded metrics (loss/accuracy, train/validation) per epoch.
fig = px.line(
    bidirect_history.history,
    markers=True,
    height=540,
    width=840,
    symbol="variable",
    labels={"variable": "Variable", "value": "Value", "index": "Epoch"},
    title="Easy Dataset - Encoder-Decoder RNN Training Process",
    color_discrete_sequence=px.colors.diverging.balance_r,
)
# Same notebook-wide theme as the earlier histogram.
fig.update_layout(
    font_color="#141B4D",
    title_font_size=18,
    plot_bgcolor="#F6F5F5",
    paper_bgcolor="#F6F5F5",
)
fig.show()
# Translate three sentences and print the reference translations first,
# followed by the model's output, for a side-by-side eyeball comparison.
translation1 = translate(bidirect_encoder_decoder, "Tell Tom to take a seat.")
translation2 = translate(bidirect_encoder_decoder, "I wish Tom were dead.")
translation3 = translate(bidirect_encoder_decoder, "She ordered three dinners.")
print("Tell Tom to take a seat. Tom'a oturmasını söyle.")
print("I wish Tom were dead. Keşke Tom ölmüş olsa.")
print("She ordered three dinners. O üç tane akşam yemeği sipariş etti.")
print()
print("Model Translations:")
print(translation1)
print(translation2)
print(translation3)
Tell Tom to take a seat. Tom'a oturmasını söyle. I wish Tom were dead. Keşke Tom ölmüş olsa. She ordered three dinners. O üç tane akşam yemeği sipariş etti. Model Translations: toma [UNK] söyle keşke tom ölmüş olsa o [UNK] [UNK] etti
import mlflow
from mlflow.models.signature import infer_signature
from mlflow.utils.environment import _mlflow_conda_env
# Per-epoch training losses recorded by Keras during fit.
loss_history =bidirect_history.history['loss']
# Assuming `train_dataset` and `bidirect_encoder_decoder` are available
model = bidirect_encoder_decoder
train_dataset = easy_train_ds
# NOTE(review): the vectorization layers were already adapted inside
# adapt_compile_and_fit on this same dataset; re-adapting here rebuilds the
# same vocabularies and appears redundant — confirm before removing.
# Adapt the vectorization layers
model.vectorization_en.adapt(
    train_dataset.map(
        lambda sentences, target: sentences[0],  # English sentences.
        num_parallel_calls=tf.data.AUTOTUNE,
    )
)
model.vectorization_tr.adapt(
    train_dataset.map(
        lambda sentences, target: sentences[1] + b" endofseq",  # Turkish sentences.
        num_parallel_calls=tf.data.AUTOTUNE,
    )
)
# NOTE(review): train_dataset_prepared is not referenced again in this file;
# presumably a leftover from the training cell — verify before deleting.
train_dataset_prepared = train_dataset.map(
    lambda sentences, target: (sentences, model.vectorization_tr(target)),
    num_parallel_calls=tf.data.AUTOTUNE,
).prefetch(tf.data.AUTOTUNE)
# Log the training curve and the trained model to MLflow.
mlflow.set_experiment("Your-First-AI")
with mlflow.start_run(run_name="run1"):
    # Log each epoch's loss with an explicit step so MLflow plots the curve
    # over epochs instead of stacking every point at step 0.
    for epoch, loss in enumerate(loss_history):
        mlflow.log_metric("loss", loss, step=epoch)
    # mlflow.tensorflow.log_model requires every field in the signature's
    # input schema to be a TensorSpec: pass numpy arrays (not plain Python
    # strings, which are inferred as ColSpec and raise MlflowException) for
    # both the example input and the example output.
    example_input = np.array(["Tell Tom to take a seat."])
    example_output = np.array(
        [translate(bidirect_encoder_decoder, "Tell Tom to take a seat.")]
    )
    signature = infer_signature(example_input, example_output)
    conda_env = _mlflow_conda_env(
        additional_conda_deps=None,
        additional_pip_deps=None,
        additional_conda_channels=None,
    )
    mlflow.tensorflow.log_model(
        bidirect_encoder_decoder,
        "bidirect_encoder_decoder",
        conda_env=conda_env,
        signature=signature,
    )
2024/04/08 14:37:53 INFO mlflow.tracking.fluent: Experiment with name 'Your-First-AI' does not exist. Creating a new experiment. /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages/_distutils_hack/__init__.py:33: UserWarning: Setuptools is replacing distutils.
--------------------------------------------------------------------------- MlflowException Traceback (most recent call last) Cell In[22], line 12 6 signature = infer_signature("Tell Tom to take a seat.", translate(bidirect_encoder_decoder, "Tell Tom to take a seat.")) 7 conda_env = _mlflow_conda_env( 8 additional_conda_deps=None, 9 additional_pip_deps=None, 10 additional_conda_channels=None 11 ) ---> 12 mlflow.tensorflow.log_model(bidirect_encoder_decoder, "bidirect_encoder_decoder", conda_env=conda_env, signature=signature) File /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages/mlflow/tensorflow/__init__.py:226, in log_model(model, artifact_path, custom_objects, conda_env, code_paths, signature, input_example, registered_model_name, await_registration_for, pip_requirements, extra_pip_requirements, saved_model_kwargs, keras_model_kwargs, metadata) 128 @format_docstring(LOG_MODEL_PARAM_DOCS.format(package_name=FLAVOR_NAME)) 129 def log_model( 130 model, (...) 143 metadata=None, 144 ): 145 """ 146 Log a TF2 core model (inheriting tf.Module) or a Keras model in MLflow Model format. 147 (...) 223 metadata of the logged model. 
224 """ --> 226 return Model.log( 227 artifact_path=artifact_path, 228 flavor=mlflow.tensorflow, 229 model=model, 230 conda_env=conda_env, 231 code_paths=code_paths, 232 custom_objects=custom_objects, 233 registered_model_name=registered_model_name, 234 signature=signature, 235 input_example=input_example, 236 await_registration_for=await_registration_for, 237 pip_requirements=pip_requirements, 238 extra_pip_requirements=extra_pip_requirements, 239 saved_model_kwargs=saved_model_kwargs, 240 keras_model_kwargs=keras_model_kwargs, 241 metadata=metadata, 242 ) File /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages/mlflow/models/model.py:562, in Model.log(cls, artifact_path, flavor, registered_model_name, await_registration_for, metadata, **kwargs) 560 run_id = mlflow.tracking.fluent._get_or_start_run().info.run_id 561 mlflow_model = cls(artifact_path=artifact_path, run_id=run_id, metadata=metadata) --> 562 flavor.save_model(path=local_path, mlflow_model=mlflow_model, **kwargs) 563 mlflow.tracking.fluent.log_artifacts(local_path, mlflow_model.artifact_path) 564 tracking_uri = _resolve_tracking_uri() File /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages/mlflow/tensorflow/__init__.py:385, in save_model(model, path, conda_env, code_paths, mlflow_model, custom_objects, signature, input_example, pip_requirements, extra_pip_requirements, saved_model_kwargs, keras_model_kwargs, metadata) 383 for field in signature.inputs.inputs: 384 if not isinstance(field, TensorSpec): --> 385 raise MlflowException( 386 "All fields in the model signature's input schema must be of type TensorSpec.", 387 error_code=INVALID_PARAMETER_VALUE, 388 ) 389 if field.shape[0] != -1: 390 raise MlflowException( 391 "All fields in the model signature's input schema must have a shape " 392 "in which the first dimension is a variable dimension.", 393 error_code=INVALID_PARAMETER_VALUE, 394 ) MlflowException: All fields in the model signature's input schema must be of type TensorSpec.